5. Main Analysis
(1) Overall Insight
NYC is a city with prosperous education development, ranging from 5-17 primary education, higher education and senior education. Here we could see the student population proportion of people in age 5-17 and senior people to NYC total population.
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(viridis)
library(grid)
library(gridExtra)
library(mi)
library(RColorBrewer)
library(scales)
library(extracat)
library(knitr)
library(vcd)
library(GGally)
#load data
loc <- read.csv("location.csv", header=TRUE, stringsAsFactors=FALSE)
loc <- loc[c(2, 3, 15, 17, 23 )]
colnames(loc) <- c("DBN", "DB", "Borough", "Zipcode", "District")
# clean
ages <- read.csv("Age.csv", header=TRUE, stringsAsFactors=FALSE)
Age2 <- ages[c(4,6,8,10,12,14,16,18,20,22)]
Age1 <- ages[c(1, 2)]
Age2 <- data.frame(lapply(Age2, function(x) as.numeric(sub("%", "", x))/100) )
Age <- cbind(Age1, Age2)
Age[Age == "NYC Total"]=" NYC Total"
colnames(Age) <- c("Age", "Borough", "1950", "1960", "1970", "1980", "1990", "2000", "2010", "2020", "2030", "2040")
AgeData <- Age[c(1,2,3,4,5,6,7,8,9)]
#plot
age_f <- Age[1:9] %>% gather(Year, Percent,3:9)
age_f[age_f == "School-Age (5 to 17)"] = "5-17"
age_f[age_f == "65 and over"] = "over 65"
age_f$Year = as.numeric(age_f$Year)
ggplot(age_f, aes(x = Year, y = Percent, col = Borough))+geom_line(size = 0.7, aes(linetype = Age))+scale_x_continuous(breaks = seq(1950, 2010, 10))+
labs(title = " Percent of Total Population 1950-2010", linetype = "Age Group")
At first, we used the histogram to show the proportion of different years, but it cannot show the trend of the proportion change over these years. Therefore, we chose to visualize the data with time series. Apparently, it can give us insight into the data.
The figure above shows the change of percent of total population of two age groups, 5-17 and over 65, from 1950 to 2010. From the figure, we could figure out that in general, young group’s percent has been decreasing over these years. Specifically, the young’s percent reached a peak in 1970 and 2000, which is consistent with the fact that the growth rate reached a peak in the 1970s and 1990s.
(This information is from the data here https://www.google.com/publicdata/explore?ds=d5bncppjof8f9_&met_y=sp_pop_grow&idim=country%3AUSA%3AIND%3ACAN&hl=en&dl=en#!ctype=l&strail=false&bcs=d&nselm=h&met_y=sp_pop_grow&scale_y=lin&ind_y=false&rdim=region&idim=country:USA&ifdim=region&hl=en_US&dl=en&ind=false).
And old group’s percent has been increasing over these years, which is consistent with the ageing of population.
Then we use time-series to analyze future education condition.
#prediction
age <- Age %>% gather(Year, Percent,3:12)
age[age == "School-Age (5 to 17)"] = "5-17"
age[age == "65 and over"] = "over 65"
age$Year = as.numeric(age$Year)
ggplot(age, aes(x = Year, y = Percent, col = Borough))+geom_line(size = 0.7, aes(linetype = Age))+scale_x_continuous(breaks = seq(1950, 2040, 10))+
labs(title = " Percent of Total Population 1950-2040", linetype = "Age Group") +
theme_grey(16)
In this figure, data predicted from the ageing of population is added.
(2) Math and ELA Score Analysis in Grade 3 ~ 8 from 2006 ~ 2016
#import math dataset
column_name = c("District", "Grade", "Year", "Category", "Number_Tested", "Mean_Scale_Score", "level1", "level1_per", "level2", "level2_per", "level3", "level3_per", "level4", "level4_per", "level3&4", "level3&4_per")
#data of 2013-2016
math_all_stu_1316 <- read.csv("math_all_stu2013-2016.csv", skip = 6, col.names = column_name)
math_ethnicity_1316 <- read.csv("math_ethnicity13-16.csv", skip = 6, col.names = column_name)
math_gender_1316 <- read.csv("math_gender13-16.csv", skip = 6, col.names = column_name)
math_swd_1316 <- read.csv("math_swd13-16.csv", skip = 6, col.names = column_name)
math_ell_1316 <- read.csv("math_ell13-16.csv", skip = 6, col.names = column_name)
math_all_stu_1316[math_all_stu_1316 == "s"] = NA
math_ethnicity_1316[math_ethnicity_1316 == "s"] = NA
math_gender_1316[math_gender_1316 == "s"] = NA
math_swd_1316[math_swd_1316 == "s"] = NA
math_ell_1316[math_ell_1316 == "s"] = NA
math_all_stu_1316$Mean_Scale_Score = as.numeric(as.character(math_all_stu_1316$Mean_Scale_Score))
math_ethnicity_1316$Mean_Scale_Score = as.numeric(as.character(math_ethnicity_1316$Mean_Scale_Score))
math_gender_1316$Mean_Scale_Score = as.numeric(as.character(math_gender_1316$Mean_Scale_Score))
math_swd_1316$Mean_Scale_Score = as.numeric(as.character(math_swd_1316$Mean_Scale_Score))
math_ell_1316$Mean_Scale_Score = as.numeric(as.character(math_ell_1316$Mean_Scale_Score))
math_ell_1316$level1 = as.numeric(as.character(math_ell_1316$level1))
math_ell_1316$level2 = as.numeric(as.character(math_ell_1316$level2))
math_ell_1316$level3 = as.numeric(as.character(math_ell_1316$level3))
math_ell_1316$level4 = as.numeric(as.character(math_ell_1316$level4))
#data of 2006-2012
math_all_stu_0612 <- read.csv("math_all_stu_0612.csv", skip = 6, col.names = column_name)
math_ethnicity_0612 <- read.csv("math_ethnicity_0612.csv", skip = 6, col.names = column_name)
math_gender_0612 <- read.csv("math_gender_0612.csv", skip = 6, col.names = column_name)
math_swd_0612 <- read.csv("math_swd06-12.csv", skip = 6, col.names = column_name)
math_ell_0612 <- read.csv("math_ell06-12.csv", skip = 6, col.names = column_name)
math_all_stu_0612[math_all_stu_0612 == "s"] = NA
math_ethnicity_0612[math_ethnicity_0612 == "s"] = NA
math_gender_0612[math_gender_0612 == "s"] = NA
math_swd_0612[math_swd_0612 == "s"] = NA
math_ell_0612[math_ell_0612 == "s"] = NA
math_all_stu_0612$Mean_Scale_Score = as.numeric(as.character(math_all_stu_0612$Mean_Scale_Score))
math_ethnicity_0612$Mean_Scale_Score = as.numeric(as.character(math_ethnicity_0612$Mean_Scale_Score))
math_gender_0612$Mean_Scale_Score = as.numeric(as.character(math_gender_0612$Mean_Scale_Score))
math_swd_0612$Mean_Scale_Score = as.numeric(as.character(math_swd_0612$Mean_Scale_Score))
math_ell_0612$Mean_Scale_Score = as.numeric(as.character(math_ell_0612$Mean_Scale_Score))
math_ethnicity_1316 <- data.frame(math_ethnicity_1316)
Here we processed original Math score data. During the process of input math data, we removed several useless lines at the beginning of the csv, and reset the column names. Then, we set the missing data, which was “s” in the original dataset, as “NA”s in order to avoid errors in following analyses. Next, I reset the type of columns for further analysis, i.e. we set the columns with missing value as numeric (which is factor because of “s”) to avoid mistakes in following numerical calculation. And here is a trick that when converting type factor to type numeric, you have to firstly change it to type character then type numeric, because R treat values with type factor as labels and ignore the value of it. If you set factor as numeric, the numeric value is random, which is not appropriate to our analyses.
##ELA
#import ELA dataset
column_name = c("District", "Grade", "Year", "Category", "Number_Tested", "Mean_Scale_Score", "level1", "level1_per", "level2", "level2_per", "level3", "level3_per", "level4", "level4_per", "level3&4", "level3&4_per")
#data of 2013-2016
ela_all_stu_1316 <- read.csv("ela_all_1316.csv", skip = 6, col.names = column_name)
ela_ethnicity_1316 <- read.csv("ela_ethnicity_1316.csv", skip = 6, col.names = column_name)
ela_gender_1316 <- read.csv("ela_gender_1316.csv", skip = 6, col.names = column_name)
ela_swd_1316 <- read.csv("ela_swd_1316.csv", skip = 6, col.names = column_name)
ela_ell_1316 <- read.csv("ela_ell_1316.csv", skip = 6, col.names = column_name)
ela_all_stu_1316[ela_all_stu_1316 == "s"] = NA
ela_ethnicity_1316[ela_ethnicity_1316 == "s"] = NA
ela_gender_1316[ela_gender_1316 == "s"] = NA
ela_swd_1316[ela_swd_1316 == "s"] = NA
ela_ell_1316[ela_ell_1316 == "s"] = NA
ela_all_stu_1316$Mean_Scale_Score = as.numeric(as.character(ela_all_stu_1316$Mean_Scale_Score))
ela_ethnicity_1316$Mean_Scale_Score = as.numeric(as.character(ela_ethnicity_1316$Mean_Scale_Score))
ela_gender_1316$Mean_Scale_Score = as.numeric(as.character(ela_gender_1316$Mean_Scale_Score))
ela_swd_1316$Mean_Scale_Score = as.numeric(as.character(ela_swd_1316$Mean_Scale_Score))
ela_ell_1316$Mean_Scale_Score = as.numeric(as.character(ela_ell_1316$Mean_Scale_Score))
ela_ell_1316$level1 = as.numeric(as.character(ela_ell_1316$level1))
ela_ell_1316$level2 = as.numeric(as.character(ela_ell_1316$level2))
ela_ell_1316$level3 = as.numeric(as.character(ela_ell_1316$level3))
ela_ell_1316$level4 = as.numeric(as.character(ela_ell_1316$level4))
#data of 2006-2012
ela_all_stu_0612 <- read.csv("ela_all_0612.csv", skip = 6, col.names = column_name)
ela_ethnicity_0612 <- read.csv("ela_ethnicity_0612.csv", skip = 6, col.names = column_name)
ela_gender_0612 <- read.csv("ela_gender_0612.csv", skip = 6, col.names = column_name)
ela_swd_0612 <- read.csv("ela_swd_0612.csv", skip = 6, col.names = column_name)
ela_ell_0612 <- read.csv("ela_ell_0612.csv", skip = 6, col.names = column_name)
ela_all_stu_0612[ela_all_stu_0612 == "s"] = NA
ela_ethnicity_0612[ela_ethnicity_0612 == "s"] = NA
ela_gender_0612[ela_gender_0612 == "s"] = NA
ela_swd_0612[ela_swd_0612 == "s"] = NA
ela_ell_0612[ela_ell_0612 == "s"] = NA
ela_all_stu_0612$Mean_Scale_Score = as.numeric(as.character(ela_all_stu_0612$Mean_Scale_Score))
ela_ethnicity_0612$Mean_Scale_Score = as.numeric(as.character(ela_ethnicity_0612$Mean_Scale_Score))
ela_gender_0612$Mean_Scale_Score = as.numeric(as.character(ela_gender_0612$Mean_Scale_Score))
ela_swd_0612$Mean_Scale_Score = as.numeric(as.character(ela_swd_0612$Mean_Scale_Score))
ela_ell_0612$Mean_Scale_Score = as.numeric(as.character(ela_ell_0612$Mean_Scale_Score))
Since ELA data had the same data structure as Math data, here we did the similar data processing as for Math data described before. I removed useless rows and set column names when inputing thr data, and then set “s” as NAs. Finally we adjusted the type of data for preparation for data analyses.
a) Heatmap on Math and ELA score by district 2006 ~ 2016
# math scores on all student
# The heatmap with district, year, and fill mean score for math on year 2006 to year 2016
math_allstu = rbind(math_all_stu_0612, math_all_stu_1316)
ggplot(math_allstu, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student in grades 3 ~ 8 Math Mean Score 2006 ~ 2016", x = "Year", y = "District")
The reason we used heatmap here was that heatmap could explicitly show the difference between years 2006 ~ 2012 and years 2013 to 2016. Because the data came from two datasets, we used “rbind” to bind them together to gain general information of 3~8 grade student math scores by district from 2006 ~ 2016.
This plot clearly indicates that students’ math scores significantly decreased after year 2012. We find the information online (http://www.nydailynews.com/new-york/education/city-students-scores-dramatic-plunge-new-standardized-tests-article-1.1419973 ) to explain this phenomenon. " Only 31% of New York State students in grades 3 to 8 passed the 2013 math and reading tests, down from 55% in English and 65% in math in 2012 on easier tests." This report reported that the sudden drop on math score did not reflect a drop in performance but rather a raising of standards on exams. In fact, “starting in 2013, the NY State Education Department (NYSED) changed the exams to be Common Core aligned.”(citing from dataset introduction) This fact has verified our data exploration on math mean scores from year 2006 to 2016.
# ELA(English Language Arts) scores on all student
# The heatmap with district, year, and fill mean score for math on year 2006 to year 2016
ela_allstu = rbind(ela_all_stu_0612, ela_all_stu_1316)
ggplot(ela_allstu, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student in grades 3 ~ 8 ELA Mean Score 2006 ~ 2016", x = "Year", y = "District")
This plot indicates that the raising standards of exams led to much lower average ELA score as well. We could see both math and ELA (English Language Arts) score decrease by the similar scale.
(2) Comparison of scores with district by 2006 ~ 2012 and 2013 ~ 2016
# Math
# Seperate Math scores by year 2006 ~ 2012 and 2013 ~ 2016
p0612 = ggplot(math_all_stu_0612, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student Math Mean Score by District 2006 ~ 2012", x = "Year", y = "District")
p1316 = ggplot(math_all_stu_1316, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student Math Mean Score by District 2013 ~ 2016", x = "Year", y = "District")
grid.arrange(p0612, p1316, nrow = 2)
Here we also used heatmap to clearly show the performance differences within and across year groups and compare students’ performance by different districts.
According to two graphs above, we can indicate that: 1) As time went by, students’ performances on math exam became better than before. 2) As the difficulty of math exam increased from year 2006 ~ 2012 to 2013 ~ 2016, the performance differences we larger than before. 3) The prominent two district were 26 and 2, which might correspond to some social factors, such as the income, which would be discussed in the final part of Main Analysis.
# ELA
# Seperate ELA scores by year 2006 ~ 2012 and 2013 ~ 2016
ela_p0612 = ggplot(ela_all_stu_0612, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student ELA Mean Score by District 2006 ~ 2012", x = "Year", y = "District")
ela_p1316 = ggplot(ela_all_stu_1316, aes(x = Year, y = District, fill = Mean_Scale_Score)) +
geom_tile() +
scale_fill_viridis() +
scale_x_continuous(breaks = seq(2006, 2016, 1)) +
scale_y_continuous(breaks = seq(1, 32, 1)) +
coord_flip() +
labs(title = "All student ELA Mean Score by District 2013 ~ 2016", x = "Year", y = "District")
grid.arrange(ela_p0612, ela_p1316, nrow = 2)
By this plot, we found that heatmap of Math and ELA performances were similar. In general, student mean score increased as time went by, yet ELA had expectations on year 2011 and 2014. In addition, compared with year group 2006 ~ 2012, the range of students’ mean scores on ELA enlarged as mean scores on Math. And the districts with higher average Math scores corresponded to the districts with higher average ELA scores. Thus, we referred that there might be some relationship between Math exam performances and ELA exam performances. So, we did two covariance matrices to indicate the relationship between Math and ELA.
(3) Covariance Matrices on Math and ELA exam performances
library(plyr)
math_all0612 <- math_all_stu_0612 %>%
mutate(year_cat = "06-12")
math_all1316 <- math_all_stu_1316 %>%
mutate(year_cat = "13-16")
ela_all0612 <- ela_all_stu_0612 %>%
mutate(year_cat = "06-12")
ela_all1316 <- ela_all_stu_1316 %>%
mutate(year_cat = "13-16")
math_all = rbind(math_all0612, math_all1316)
ela_all = rbind(ela_all0612, ela_all1316)
math_allstu_rn <- rename(math_all, c("District" = "math_district", "Grade" = "math_grade", "Year" = "math_year", "Category" = "math_category", "Number_Tested" = "math_num_test", "Mean_Scale_Score" = "math_mean", "level1" = "math_level1", "level1_per" = "math_level1_per", "level2" = "math_level2", "level2_per" = "math_level2_per", "level3" = "math_level3", "level3_per" = "math_level3_per", "level4" = "math_level4", "level4_per" = "math_level4_per", "level3.4" = "math_level3.4", "level3.4_per" = "math_level3.4_per", "year_cat" = "math_year_cat"))
ela_allstu_rn <- rename(ela_all, c("District" = "ela_district", "Grade" = "ela_grade", "Year" = "ela_year", "Category" = "ela_category", "Number_Tested" = "ela_num_test", "Mean_Scale_Score" = "ela_mean", "level1" = "ela_level1", "level1_per" = "ela_level1_per", "level2" = "ela_level2", "level2_per" = "ela_level2_per", "level3" = "ela_level3", "level3_per" = "ela_level3_per", "level4" = "ela_level4", "level4_per" = "ela_level4_per", "level3.4" = "ela_level3.4", "level3.4_per" = "ela_level3.4_per", "year_cat" = "ela_year_cat"))
math_ela <- cbind(math_allstu_rn, ela_allstu_rn)
math_ela <- cbind("id" = rownames(math_ela), math_ela)
df1 <- select(math_ela,id,starts_with("m"))
df2 <- select(math_ela,id,starts_with("e"))
# math and ela comparison
mmean <- df1 %>%
select(id, math_district, math_mean, math_year_cat) %>%
gather(math_var,math_mean,-id, -math_district, -math_year_cat) %>%
rename(., c("math_district" = "District", "math_mean" = "value", "math_year_cat" = "year_cat"))
mlevel1_per <- df1 %>%
select(id,math_district, math_level1_per, math_year_cat) %>%
gather(math_var,math_level1_per,-id, -math_district, -math_year_cat) %>%
rename(., c("math_district" = "District", "math_level1_per" = "value", "math_year_cat" = "year_cat"))
mlevel2_per <- df1 %>%
select(id, math_district, math_level2_per, math_year_cat) %>%
gather(math_var,math_level2_per,-id, -math_district, -math_year_cat) %>%
rename(., c("math_district" = "District", "math_level2_per" = "value", "math_year_cat" = "year_cat"))
mlevel3_per <- df1 %>%
select(id, math_district, math_level3_per, math_year_cat) %>%
gather(math_var,math_level3_per,-id, -math_district, -math_year_cat) %>%
rename(., c("math_district" = "District", "math_level3_per" = "value", "math_year_cat" = "year_cat"))
mlevel4_per <- df1 %>%
select(id, math_district, math_level4_per, math_year_cat) %>%
gather(math_var,math_level4_per,-id, -math_district, -math_year_cat) %>%
rename(., c("math_district" = "District", "math_level4_per" = "value", "math_year_cat" = "year_cat"))
dftm <- rbind(mlevel1_per, mlevel2_per) %>%
rbind(., mlevel3_per) %>%
rbind(., mlevel4_per)
emean <- df2 %>%
select(id, ela_district, ela_mean, ela_year_cat) %>%
gather(ela_var,ela_mean,-id, -ela_district, -ela_year_cat) %>%
rename(., c("ela_district" = "District", "ela_mean" = "value", "ela_year_cat" = "year_cat"))
elevel1_per <- df2 %>%
select(id,ela_district, ela_level1_per, ela_year_cat) %>%
gather(ela_var,ela_level1_per,-id, -ela_district, -ela_year_cat) %>%
rename(., c("ela_district" = "District", "ela_level1_per" = "value", "ela_year_cat" = "year_cat"))
elevel2_per <- df2 %>%
select(id,ela_district, ela_level2_per, ela_year_cat) %>%
gather(ela_var,ela_level2_per,-id, -ela_district, -ela_year_cat) %>%
rename(., c("ela_district" = "District", "ela_level2_per" = "value", "ela_year_cat" = "year_cat"))
elevel3_per <- df2 %>%
select(id,ela_district, ela_level3_per, ela_year_cat) %>%
gather(ela_var,ela_level3_per,-id, -ela_district, -ela_year_cat) %>%
rename(., c("ela_district" = "District", "ela_level3_per" = "value", "ela_year_cat" = "year_cat"))
elevel4_per <- df2 %>%
select(id,ela_district, ela_level4_per, ela_year_cat) %>%
gather(ela_var,ela_level4_per,-id, -ela_district, -ela_year_cat) %>%
rename(., c("ela_district" = "District", "ela_level4_per" = "value", "ela_year_cat" = "year_cat"))
dfte <-rbind(elevel1_per, elevel2_per) %>%
rbind(., elevel3_per) %>%
rbind(., elevel4_per)
dft <- dftm %>%
left_join(dfte, by="id")
df_mean <- mmean %>%
left_join(emean, by = "id")
ggplot(data = dft,aes(x = value.x, y = value.y)) +
geom_point(aes(col = year_cat.x), alpha = 0.05) +
geom_smooth(aes(linetype = year_cat.x), fill = NA, size = 0.5) +
facet_grid(math_var ~ ela_var) +
labs(title = "Covariance Matrix on Math and ELA scores on grades 3 ~ 8 2006 ~ 2012 by levels (1)", x = "ELA Performane Levels Proportion", y = "Math Performance Levels Proportion", linetype = "Year Group Line", col = "Year Group Point")
Because of the similar looks on heatmaps of Math and ELA, we wanted to validate a hypothesis that student with higher scores on Math or ELA performs better in another subject as well in general. Thus, we chose Covariance Matrix to indicate the possible relationship.
However, because the function “pairs” and “plot” could not clearly indicate the trends by different categories, we did a much complex procedure to draw the covairance matrix plot. The challege we met was combining the useful information into one dataframe and then we could use ggplot2 to plot what we wanted. We use package “dplyr” to select and gather the data. Finally we could figure out the trend with ELA performances and Math performances by year group, which avoided errors caused by different exam standards. And the reason why we used feature “levels” instead of feature “scores” here was also the same.
According to the graph above, especially the plots in diagonal, we could indicate that the porprotion of students in levelN in Math exams were almost linearly related to the one of students in levelN, which might show that students with higher Math scores might have higher ELA score as well. Obviously, This graph validated our hypothesis.
# Mean covariance
ggplot(data = df_mean,aes(x = value.x, y = value.y)) +
geom_point(aes(col = year_cat.x), alpha = 0.2) +
geom_smooth(aes(linetype = year_cat.x), col = "black", size = 0.5) +
facet_grid(math_var ~ ela_var) +
labs(title = "Covariance Matrix on Math and ELA scores on grades 3 ~ 8 2006 ~ 2012 by Mean Score", x = "ELA score", y = "Math score")
Here we used mean scores to indicate the general relationship between Ela and Math by year categories. We could find that in 2006 ~ 2016, all students’ Math and ELA mean scores had positive correlation.
ggplot(data = dft,aes(x = value.x, y = value.y)) +
geom_smooth(aes(col = as.factor(District.x), linetype = year_cat.x), fill = NA, size = 0.5) +
facet_grid(math_var ~ ela_var) +
guides(col = FALSE) +
labs(title = "Covariance Matrix on Math and ELA scores on grades 3 ~ 8 2006 ~ 2012 by levels (2)", x = "ELA Performane Levels Proportion", y = "Math Performance Levels Proportion", linetype = "Year Group")
And then, we also validated the relationship between students’ Math and ELA performances by districts. Although we could not figure out the relationship for one specific district, we still could figure out the relationship in general. Clearly, for students in different districts, the positive correlation between Math and ELA scores still existed.
b) Math and ELA Score Analysis by Ethnicity
(i) Histogram on Mean score
#math_ethnicity
#histogram
math_phist1 <- math_ethnicity_1316 %>%
group_by(Category) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(Subject = "Math")
math_phist2 <- math_ethnicity_0612 %>%
group_by(Category) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(Subject = "Math")
ela_phist1 <- ela_ethnicity_1316 %>%
group_by(Category) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(Subject = "ELA")
ela_phist2 <- ela_ethnicity_0612 %>%
group_by(Category) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(Subject = "ELA")
pphist0612 <- rbind(math_phist2, ela_phist2)
pphist1316 <- rbind(math_phist1, ela_phist1)
subject_hist0612 <- ggplot(pphist0612, aes(x = Category, y = mean_score)) +
geom_bar(stat = "identity", position = "dodge", aes(fill = Subject)) +
labs(title = "2006 ~2012 Mean Score with Ethnicity", x = "Ethnicity", y = "Mean Score", fill = "Year Group")
subject_hist1316 <- ggplot(pphist1316, aes(x = Category, y = mean_score)) +
geom_bar(stat = "identity", position = "dodge", aes(fill = Subject))+
labs(title = "2013 ~2016 Mean Score with Ethnicity", x = "Ethnicity", y = "Mean Score", fill = "Year Group")
grid.arrange(subject_hist0612, subject_hist1316, nrow = 1)
We chose bar chart with “dodge” here because it can show ELA and Math scores together and their heights, which was convenient for readers to compare. We tried to put performances with different Ethnicity during the period 2006 ~ 2016, but that method made the differences between ethnicities unclear.
According to the graphs above, we could figure out that Asians outperformed in math and ELA exams than other ethnicities and Whites performed slightly better than other two ethnicities. And in general, Hispanics performed a bit better than Blacks. In addition, student’s math scores were higher than ELA scores in general.
(ii) Line chart on Mean score by district
# Math pcp plot
pcp_math_eth <- math_ethnicity_1316 %>%
group_by(Category, District) %>%
dplyr::summarise(mean_score=mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(year_cat = "13-16")
pcp_math_eth2 <- math_ethnicity_0612 %>%
group_by(Category, District) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(year_cat = "06-12")
pcp_math <- rbind(pcp_math_eth, pcp_math_eth2)
ggplot(pcp_math, aes(x = District, y = mean_score, col = Category)) +
geom_line(aes(linetype = year_cat)) +
scale_x_continuous(breaks = seq(1, 32, 1)) +
labs(title = "Student Math Mean Score with Ethnicity and District", x = "District", y = "Mean Score", col = "Ethnicity", linetype = "Year Group")
Because this plot is easy to figure out and compare the trends compared to a bar chart, we chose line chart to indicate the overall trends by the district through different ethnicities.
According to the plot above, we found that Asians’ math scores were quite stable by district and Whites’ math scores were various through districts. The parallel coordinates plot above clearly showed that the shapes of lines by two different groups were almost the same, which indicated that students’ academic performances stayed stable with time through different districts. Moreover, what was surprised was that at District 9, 16, 17, 18, 19 and 23, Whites performed even worse than Hispanics and Blacks. There might be some reason for this phenomenon.
#ela_ethnicity 2013 ~2016
#pcp plot
pcp_ela_eth <- ela_ethnicity_1316 %>%
group_by(Category, District) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(year_cat = "13-16")
pcp_ela_eth2 <- ela_ethnicity_0612 %>%
group_by(Category, District) %>%
dplyr::summarise(mean_score = mean(Mean_Scale_Score, na.rm = TRUE)) %>%
mutate(year_cat = "06-12")
pcp_ela <- rbind(pcp_ela_eth, pcp_ela_eth2)
ggplot(pcp_ela, aes(x = District, y = mean_score, col = Category)) +
geom_line(aes(linetype = year_cat)) +
scale_x_continuous(breaks = seq(1, 32, 1)) +
labs(title = "Student ELA Mean Score with Ethnicity and District", x = "District", y = "Reletive value", col = "Ethnicity", linetype = "Year Group")
The parallel coordinates plot above clearly showed that the shapes of lines by two different groups were almost the same, which indicated that students’ academic performances stayed stable with time through different districts. Moreover, what shocked us was that at District 7, 9, 16, 17, 18, 19 and 23 in the year 2013 to 2016, Whites performed even worse than Hispanics and Blacks, which appeared the same phenomenon in Math.
c) Math Score Analysis by Gender
(i) Line chart math score analysis by gender, grade and year group
#math_gender
#pcp math score by gender, grade and year group
pcp_gender0612 <- math_gender_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1)/sum(Number_Tested), level2_proportion = sum(level2)/sum(Number_Tested), level3_proportion = sum(level3)/sum(Number_Tested), level4_proportion = sum(level4)/sum(Number_Tested)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12")
pcp_gender1316 <- math_gender_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1)/sum(Number_Tested), level2_proportion = sum(level2)/sum(Number_Tested), level3_proportion = sum(level3)/sum(Number_Tested), level4_proportion = sum(level4)/sum(Number_Tested)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16")
pcp_gender = rbind(pcp_gender0612, pcp_gender1316)
ggplot(pcp_gender, aes(x = as.numeric(as.factor(Levels)), y = Proportion, col = Grade, alpha = year_cat)) +
geom_line(aes(linetype = Category)) +
scale_alpha_discrete(range = c(0.4, 1)) +
labs(title = "Student Math Score Levels Proportion w.r.t Gender, Grade and Year group", x = "Math Score Levels", y = "Proportion", linetype = "Gender", alpha ="Year Group", col = "Grade")
Here we also chose line chart to indicate the overall trends of performances on levels by different grade, gender, and year group. This plot could clearly compare these three categories within one plot, which was more informative than single bar chart for readers.
According to the plot above, we could figure out some information by three dimensions with levels definition.
Levels Definition: NYS Level 1: Students performing at this level are well below proficient in standards for their grade. NYS Level 2: Students performing at this level are partially proficient in standards for their grade. NYS Level 3: Students performing at this level are proficient in standards for their grade. NYS Level 4: Students performing at this level excel in standards for their grade.
Gender(line and dot line): We could indicate that females performed slightly better than males in math exams, in that the proportions of females in level3 and level4 were a little bit larger than of males, while the proportions of males in level1 and level2 were a bit larger than of females.
Year group (transparency in 0.4 and 1): We could clearly observe that in year group 06-12, there was the largest proportion on level3 and became less and less on level2 then level1. And around 25% students were in level4 with excellent performance. However, because of the raising standard in exams, in the year group 13-16, the proportion of students’ math performances was inversely proportional to levels. This plot coincided with our former analysis in all student math score in 2006~2012 and 2013~2016 that the differences on students’ math exam performances were greater as the degree of difficulty of math exam increased.
Grade(color of lines): In the condition of the higher standard on math exams(years 2013 to 2016), the higher grade students were in, the greater score difference would be. For example, students in grade8 had larger proportion on level1 and level4 than grade5. In the condition of easier difficulty on math exams (years 2006 to 2013), the lower grade students were in, the greater score difference would be. For instance, students in grade 3 make up more than grade8 in level3 and less in level1 and level2.
# ELA analysis on gender
#pcp ELA score by gender, grade and year group
ela_pcp_gender0612 <- ela_gender_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1)/sum(Number_Tested), level2_proportion = sum(level2)/sum(Number_Tested), level3_proportion = sum(level3)/sum(Number_Tested), level4_proportion = sum(level4)/sum(Number_Tested)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12")
ela_pcp_gender1316 <- ela_gender_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1)/sum(Number_Tested), level2_proportion = sum(level2)/sum(Number_Tested), level3_proportion = sum(level3)/sum(Number_Tested), level4_proportion = sum(level4)/sum(Number_Tested)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16")
ela_pcp_gender = rbind(ela_pcp_gender0612, ela_pcp_gender1316)
ggplot(ela_pcp_gender, aes(x = as.numeric(as.factor(Levels)), y = Proportion, col = Grade, alpha = year_cat)) +
geom_line(aes(linetype = Category)) +
scale_alpha_discrete(range = c(0.4, 1)) +
labs(title = "Student ELA Score Levels Proportion w.r.t Gender, Grade and Year group", x = "ELA Score Levels", y = "Proportion", linetype = "Gender", alpha ="Year Group", col = "Grade")
According to the plot above, we could figure out some information by three dimensions with levels definition mentioned above.
Gender(line and dot line): We could indicate that females performed better than males in ELA exams with greater difference than in Math exams, in that the proportions of females in level3 and level4 were a little bit larger than of males, while the proportions of males in level1 and level2 were a bit larger than of females.
Year group(transparency in 0.4 and 1): We could clearly observe that in year group 06-12, there was the largest proportion on level3 for females and the largest proportion on level2 for males. And less than 10% students were in level4 with excellent performance. However, because of the raising standard in exams, from level2 to level4 in the year group 13-16, the proportion of students’ math performances was inversely proportional to level. What surprised us was that although the standard raised and mean scores decreased sharply, students in level4 in year group 2013 ~ 2016 were more than in year group 2006 ~ 2012.
Grade(color of lines): For ELA grade, it seems that students’ grades were not related to ELA scores.
(ii) Gender comparison on proportion in stacks plot
# Math
#gender comparison with stacks
ggplot(pcp_gender, aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "Student Math Score Levels by gender comparison", x = "Gender", y = "Proportion", fill = "Levels") +
theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
Because the total proportions of students in different levels were 100% so we chose stacked bar charts to analyze the performance differences between Males and Females.
According to the graph above, we could figure out that the performances of females and males were almost the same, and females’ performances were slightly better than males’. This was because the proportion line of males’ performance was a little bit higher than females’, corresponding to more lower-level performances by males.
# ela
#gender comparison with stacks
ggplot(ela_pcp_gender, aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "Student ELA Score Levels by gender comparison", x = "Gender", y = "Proportion", fill = "Levels") +
theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
According to the graph above, we could find that females performed better than males in all grades, especially when the exam standards raised. Moreover, this plot also corresponded to our former conclusion that students in different grades had similar performances in ELA exams. And with higher standards, students performed better in excellent (level4), much less in good (level3) and increasing amount in bad (level1).
d) Math Score Analysis by Disability Status
Analysis with grades, year group, and SWD Category on stack plot
#math SWD(student with disability)
plot_swd0612 <- math_swd_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12")
plot_swd1316 <- math_swd_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16")
plot_swd = rbind(plot_swd0612, plot_swd1316)
ggplot(plot_swd, aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "Math Score Levels by Disability or Not", x = "Disability Status", y = "Proportion", fill = "Levels") +
theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
We also used stacked bar charts for SWD analyses as well. The comparison was very apparent by stack bar charts.
Obviously, students with a disability (SWD) performed much worse than students without a disability. Moreover, in general, the higher grade the students were in, the worse the performances of students both in SWD and notSWD groups.
#math ELA(student with disability)
ela_plot_swd0612 <- ela_swd_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12")
ela_plot_swd1316 <- ela_swd_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16")
ela_plot_swd = rbind(ela_plot_swd0612, ela_plot_swd1316)
ggplot(ela_plot_swd, aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "ELA Score Levels by Disability or Not", x = "Disability Status", y = "Proportion", fill = "Levels") +
theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
Like the phenomenon in Math, students with a disability(SWD) performed much worse than students without a disability. Moreover, whether students were disabled or not, there were fewer students in level4 of ELA than of Math. And students with a disability accounted for the higher proportion of level1 in ELA exams than in Math exams.
e) Math Score Analysis by English Proficient Status
Analysis with grades, year group, and ELL Category on stack plot
#math Ell(English learner)
plot_ell0612 <- math_ell_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12") %>%
ggplot(aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "Math Score Levels with respect to English Proficient Status", x = "English Proficient Status", y = "Proportion", fill = "Levels") +
#theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
plot_ell1316 <- math_ell_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16") %>%
ggplot(aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(x = "English Proficient Status", y = "Proportion", fill = "Levels") +
#theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
grid.arrange(plot_ell0612, plot_ell1316, nrow = 2)
Finally, for ELL analyses, we took stacked bar charts because it was the clearest way to compare within our dataset.
According to the plot above, we could indicate the relationship between English Proficient Status and students’ performance level.
The definition of ELL, EP, Former ELL: ELL: English-Language Learner. “An English language learner (often capitalized as English Language Learner or abbreviated to ELL) is a person who is learning the English language in addition to his or her native language.” (Wikipedia) EP: People with English Proficient are those who take English as mother tongue. Former ELL: Former English Language Learners. “Former English Language Learners includes any students who were classified as English Language Learners in at least one of the previous two school years.” (Official Notes for this dataset)
We could find that students in ELL group performed much worse than EP groups through years 2006 to 2016. In years 2013 to 2016, official education department added a new class for English Proficient Status as “Former ELL”. We could find that except for grade3, students in Former ELL group performed better than ELL and slightly worse than EP, which might indicate that better English Language ability improved students’ understanding of math.
# ELA Ell(English learner)
ela_plot_ell0612 <- ela_ell_0612 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "06-12") %>%
ggplot(aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(title = "ELA Score Levels with respect to English Proficient Status", x = "English Proficient Status", y = "Proportion", fill = "Levels") +
#theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE))
ela_plot_ell1316 <- ela_ell_1316 %>%
group_by(Grade, Category) %>%
dplyr::summarise(level1_proportion = sum(level1, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level2_proportion = sum(level2, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level3_proportion = sum(level3, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE), level4_proportion = sum(level4, na.rm = TRUE)/sum(Number_Tested, na.rm = TRUE)) %>%
gather(Levels, Proportion, -Grade, -Category) %>%
mutate(year_cat = "13-16") %>%
ggplot(aes(x = Category, y = Proportion, fill = factor(Levels, levels = c("level4_proportion", "level3_proportion", "level2_proportion", "level1_proportion")))) +
geom_bar(stat = "identity") +
facet_grid(year_cat ~ Grade) +
labs(x = "English Proficient Status", y = "Proportion", fill = "Levels") +
#theme(legend.position = "bottom") +
guides(fill=guide_legend(reverse=TRUE)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
grid.arrange(ela_plot_ell0612, ela_plot_ell1316, nrow = 2)
According to the plot above, we could indicate that the proficiency of English had a larger impact on ELA scores than Math scores. Students in ELL category performed much worse than students in EP and former ELL categories. However, according to the definition of “Former Ell”, we inferred that students could quickly adapt to English study environment as a former ELL in one to two years, and improve their academic performances.
(3) High School Student Performances Analysis
a) Overall Graduation Rate among Districts
col_names<-c("District","Category", "Cohort_Year", "Cohort", "num_cohort", "num_grads", "per_grads_cohort", "num_regents", "per_regents_cohort", "per_regents_grads", "num_adregents", "per_adregents_cohort", "per_adregents_grads" , "num_wadregents", "per_wadregents_cohort", "per_wadregents_grads","num_local","per_local_cohort","per_local_grads","num_enrolled","per_enrolled_cohort","num_dropout","per_dropout_cohort","num_SACC","per_SACC_cohort","num_TASC","per_TASG_cohort")
graduate<-read.csv("2016 Graduation_Rates_Public_District_ALL.csv",skip=8,col.names =col_names)
# remove unecessary features
graduate<-graduate[-2]
cols<-col_names[6:27]
# standardize missing data
graduate[graduate == "s"] = NA
# transform the fator into numeric
graduate[,cols] <- apply(graduate[,cols], 2, function(x) as.numeric(x))
First and foremost, we had to do data cleaning and preprocessing to the data set in order to analyze the data. As the format of the data frame was a little messy when it was first imported to RStudio, we got to rearrange the data set related to the graduation information, such as renaming the column names, removing the unnecessary features, standardizing missing data and transforming the data type of the graduation indicators from factor into numeric.
##remove the missing data in per_grads_cohort
graduate <- graduate[complete.cases(graduate),]
graduate <- graduate%>% filter(District!=79)
graduate_per<- graduate%>%
group_by(District, Cohort_Year) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
From the missing data graph, we found out that a few observations are missing on all included variables. Often, this indicates a more complicated model is needed for this missingness mechanism, but we decided to remove all the missing observations since the number of missing observations was rather small compared to the total data set.
Besides, we found out there was an extra district 79, New York City’s Alternative Schools District, included in the data set. Since it was not the focus of our research, we decided to drop the observations of District 79 to avoid the influence.
(i) Heatmap of the Graduation Rate among Districts
graduate_per$District <- factor(graduate_per$District)
ggplot(graduate_per, aes(Cohort_Year, District, fill = sum_per_grad)) +
geom_tile() +
# geom_text(aes(label =round(sum_per_grad,3)), color = "white",size=3) +
scale_x_continuous(breaks = seq(2001, 2012, 1))+
theme(axis.text.x = element_text(angle=10))+
scale_fill_viridis()+
labs(title = "The graduation rate of different districts from 2001 to 2012",x="Cohort Year", fill="Graduation Rate")
After the process of data cleansing, we attempted to demonstrate the graduation rates of all students among the 32 districts. According to the dataset, the cohort consists of all students who first entered 9th grade in a given school year (e.g. the Cohort of 2006 entered 9th grade in the 2006-2007 school year). Graduates are defined as those students earning either a Local or Regents diploma. From the heatmap, we found out that the two districts with the best graduation rate were District 26 and District 13, and the districts with the decent graduation rates were 28, 22 and 4. On the other hand, the graduation rate of District 23, 16 and 8 was the lowest, especially District 23 where the rate was decreasing after 2004 while the rate of other districts was mostly gradually raising.
As we have known, District 23 is one of the smallest in the city, which serves Ocean Hill, Brownsville, and parts of East New York, and includes some of the city’s poorest neighborhoods. Single-family homes are interspersed with old brownstones, apartment buildings, large public housing projects and homeless shelters. The graduation rate shown by the graph seems to match with the education condition of the districts.
From the website (http://insideschools.org/districts/brooklyn/district-23), the schools, among the lowest-performing in the city, have been losing population, partly as a result of competition from charter schools. The Community Education Council voted in 2013 to do away with zoned schools, allowing parents to apply to any school in the district. However, parents complained that there were few viable choices.
(ii) Indicators of Districts with the Best and Worst Overall Graduation Rate
graduate_tidy<- graduate%>%
group_by(District,Cohort_Year) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
graduate_tidy$District <- factor(graduate_tidy$District)
orderclass <- graduate%>%
group_by(District) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
orderclass_best <- orderclass [order(-orderclass$sum_per_grad),][1:3,]
orderclass_worst <- orderclass [order(orderclass$sum_per_grad),][1:3,]
graduate_tidy1 <- within(graduate_tidy,{
cat<-NA
cat[District %in% orderclass_best$District ]<-"best"
cat[District %in% orderclass_worst$District ]<-"worst"
cat[is.na(cat)]<-"normal"
}
)
graduate_tidy1<-graduate_tidy1%>%filter(cat !='normal')
graduate_tidy1$cat <- factor(graduate_tidy1$cat, labels = c("best","worst"))
After data preprocessing, there was another challenge of data processing we faced. We attempted to demonstrate the overall graduation rates of all districts over the years, but the number of districts was a bit too large for presentation. Thus, we tried to select the three districts with the best graduation rate and three with the worst one.
To attain this goal, we created a data frame called “orderclass” which contains the overall graduation rate grouped by district, then derive the three best and three worst records from that in order to add a new feature “cat” to the processed dataset with the purpose of indicating the category of the districts set up by ourselves.
graduate_tidy1$District<-factor(graduate_tidy1$District,levels=c(26,23,13,16,28,18))
to_string<-as_labeller(c('1'='district26','2'='district23','3'='district13','4'='district16','5'='district28','6'='district18'))
glabel<-c('Graduation Rate','Dropout Rate','Regent Rate','Advanced Regent Rate','Without Advanced Regent Rate','Local Rate','SACC Rate','TASC Rate')
ggparcoord(graduate_tidy1, columns =3:10,scale="globalminmax",groupColumn = "cat",alphaLines = 0.3)+theme(axis.text.x = element_text(size=10,angle=20))+
scale_x_discrete(labels=glabel)+
labs(title="Indicators of Six Districts with the Best and Worst Overall Graduation Rate ",col='Best/Worst') +
facet_wrap(~District,scales='fixed',nrow=3,labeller=to_string)
We applied parallel coordinate to analyze the patterns of graduation indicators on districts with the best and the worst overall graduation rate. The multiple lines for each grid refer to the rates for each year.
For the districts with the best graduation rates, we found out that there was a similar pattern shared by the districts – they all got a relatively low dropout rate, low Local diploma percentage, low SACC percentage, low TASC percentage and high Regent percentage. Especially for District 26 and 13, they got a high rate of Advanced Regent Diploma and low rate of Regent without Advanced over the years.
For the three districts with the worst graduation rates, the dropout rate is relatively high and the Advanced Regent rate is relatively low. There were fluctuations existent over the indicators Regent rate, the rate of Regent without Advanced and Local rate among the years, especially for District 18, where we could see there were almost two patterns over the mentioned three indicators.
We would explain the meaning of the indicators related to graduation. New York State has selected a new high school equivalency test called the Test Assessing Secondary Completion (TASC) to replace the General Educational Development (GED) as the primary pathway to a New York State High School Equivalency Diploma effective January 2, 2014. The TASC is a secure, reliable and valid instrument that is used to verify that examinees have knowledge in core content areas equivalent to that of graduating high school seniors.
The Skills & Achievement Commencement Credential (SACC) is a certificate available only to students with severe cognitive disabilities who are eligible to take the NYSAA and have attended school for not less than 12 years, excluding Kindergarten. The Career Development and Occupational Studies (CDOS) Commencement Credential, which indicates that the student has the knowledge and skills necessary for entry-level employment.
In terms of the Local Diploma, all students can earn a Regents or Advanced Regents diploma; however, only students who meet specific criteria are eligible to graduate with a local diploma, which allows students to graduate with lower exam scores. Besides, an Advanced Regents diploma lets students demonstrate additional skills in math, science, and languages other than English.
b) Graduation Rate Analysis by English Level
graduate_ell<-read.csv("2016 Graduation_Rates_Public_District_ELL.csv",skip=8,col.names =col_names)
cols<-col_names[6:27]
graduate_ell[graduate_ell == "s"] = NA
graduate_ell[,cols] <- apply(graduate_ell[,cols], 2, function(x) as.numeric(x))
We imported the dataset of graduation indicators by English level, utilizing the techniques we used before including standardizing missing data and transferring the data type of features.
##remove the missing data in per_grads_cohort
graduate_ell <- graduate_ell[complete.cases(graduate_ell),]
graduate_ell <- graduate_ell%>% filter(District!=79)
We implemented the similar data preprocessing stated as above to prepare the data set for analysis. We figured out that a few observations are missing on all included variables. Therefore, we decided to remove all the missing observations, since the number of missing observations was rather small compared to the total data set.
Besides, we found out there was an extra district 79, New York City’s Alternative Schools District, included in the data set. Since it was not the focus of our research, we decided to drop the observations of District 79 to avoid the influence.
(i) Graduation Rate Analysis by English Level and District
graduate_ell_phist <- graduate_ell %>%
group_by(Category,Cohort_Year) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
graduate_ell_per <- graduate_ell %>%
group_by(Category, District) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
phist<- ggplot(graduate_ell_phist, aes(x =factor(Cohort_Year) , y = sum_per_grad, fill =Category)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Graduation Rate with respect to English Level", x = "Year", y = "Graduation Rate")
#grid.arrange(math_hist, math_pcp, nrow = 2)
pline<- ggplot(graduate_ell_per , aes(x = District, y = sum_per_grad, col = Category) )+
geom_line()+
scale_x_continuous(breaks = seq(1, 32, 1)) +
ylim(0,1)+
labs(title = "Graduation Rate with English Level and District", x = "District", y = "Graduation Rate")
grid.arrange(phist, pline, nrow = 2)
From the bar chart, it indicates the graduation rate with respect to English levels during the period from 2001 to 2012. In order to achieve this graph, we summarized the graduation rate grouped by category and cohort year. Over the years, the graduation rate for Former ELL was the best, while the rate for ELL was the worst. Besides, though the rate for English Proficient was in the middle, it was gradually increasing and approaching to the one for Former ELL.
From the line graph, it shows the overall graduation rate of three English levels among the 32 districts. In order to achieve this graph, we summarized the different graduation indicators grouped by category and district. The graduation rate pattern over the districts was shared by the three English levels generally. However, in the District 3, the rate for English Proficient was slightly better than the one for the Former ELL. Among all these districts, the graduation rates for District 13 were the best for all three English categories, while the graduation rates for District 8,16 and 23 were the worst for ELL, English Proficient, and Former ELL. There were also some special cases among districts. For instance, the difference in graduation rates between ELL and English Proficient was the largest in District 6, since the rate of English Proficient was close to that of Former ELL. However, the graduation rates of ELL and English Proficient was pretty close in District 18.
(ii) Graduation Indicators by English Levels
graduate_ell_tidy <- graduate_ell %>%
group_by(Category) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
graduate_ell_tidy<-graduate_ell_tidy%>%gather(Indicator, Percentage, -Category,-sum_per_grad)
graduate_ell_tidy$Indicator<-factor(graduate_ell_tidy$Indicator,levels=c('sum_per_regent','sum_per_adregent','sum_per_wadregent','sum_per_dropout','sum_per_local','sum_per_sacc','sum_per_tasc'))
There was an another challenge we encountered while data analysis. We tried to facet wrap the indicator so as to show the different graduation rates with respect to English levels after summarizing the graduation rates grouped by category. However, the summarized graduation indicators were in wide forms, thus we had to transform them into the long format by using gather function.
to_string<-as_labeller(c('sum_per_adregent'='Advanced Regent Rate','sum_per_dropout'='Dropout Rate','sum_per_local'='Local Rate','sum_per_regent'='Regent Rate','sum_per_sacc'='SACC Rate','sum_per_tasc'='TASC Rate','sum_per_wadregent'='Without Advanced Regent Rate'))
ggplot(graduate_ell_tidy, aes(x = Category, y = Percentage, fill=factor( Category))) +
geom_bar(stat = "identity") +
facet_wrap(~factor(Indicator),labeller=to_string) +
labs(title = "Graduation Indicators by English Level", x = "English Level", y = "Indicator Precentage",fill="Category")+
theme(axis.text.x = element_text(size=10,angle=20))
# theme(legend.position = "bottom")+
# guides(fill=guide_legend(reverse=TRUE))
For dropout rate, ELL was highest, while Former ELL was lowest. For the indicators Regent rate and Without Advanced Regent Rate, Former ELL was the best, then was English Proficient, while ELL was the worst. However, for the Advanced Regent rate, English Proficient was slightly better than the Former ELL. Besides, there were fewer individuals of English Proficient type getting Local Diplomas(allowed to graduate with fewer exam scores)
Therefore, we could conclude that Former ELL performs better for the general graduation rates, while the English Proficient performs better for the advanced levels, which means more students of English Proficient earned high exam scores among the graduate students. And the students as English learners ranked last in various dimensions.
c) Graduation Rate Analysis by Gender
graduate_gen<-read.csv("2016 Graduation_Rates_Public_District_Gender.csv",skip=8,col.names =col_names)
cols<-col_names[6:27]
graduate_gen[graduate_gen == "s"] = NA
graduate_gen[,cols] <- apply(graduate_gen[,cols], 2, function(x) as.numeric(x))
We imported the dataset of graduation indicators by gender, utilizing the techniques we used before including standardizing missing data and transferring the data type of features.
##remove the missing data in per_grads_cohort
graduate_gen <- graduate_gen[complete.cases(graduate_gen),]
graduate_gen <- graduate_gen%>% filter(District!=79)
We implemented the similar data preprocessing stated as above to prepare the data set for analysis. We found out that a few observations were missing on all included variables. Therefore, we decided to remove all the missing observations, since the number of missing observations was rather small compared to the total data set.
Besides, we found out there was an extra district 79, New York City’s Alternative Schools District, included in the data set. Since it was not the focus of our research, we decided to drop the observations of District 79 to avoid the influence.
(i) Graduation Rate Analysis by Gender,District and Year
graduate_gen_phist <- graduate_gen %>%
group_by(Category,Cohort_Year) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
graduate_gen_per <- graduate_gen %>%
group_by(Category, District) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
phist<- ggplot(graduate_gen_phist, aes(x =factor(Cohort_Year) , y = sum_per_grad, fill =Category)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Graduation Rate with respect to Gender over the years", x = "Year", y = "Graduation Rate")
#grid.arrange(math_hist, math_pcp, nrow = 2)
pline<- ggplot(graduate_gen_per , aes(x = District, y = sum_per_grad, col = Category) )+
geom_line()+
scale_x_continuous(breaks = seq(1, 32, 1)) +
ylim(0,1)+
labs(title = "Graduation Rate with Gender and District", x = "District", y = "Graduation Rate")
grid.arrange(phist, pline, nrow = 2)
In order to achieve this bar chart, we summarized the graduation rate grouped by gender and cohort year. From the bar chart, we have known that over the years, the graduation rate of females was better than the one of the males. The rates of both males and females showed an upward trend from 2001 to 2012.
In order to achieve the line graph, we summarized the different graduation indicators grouped by gender and district. According to the line graph, the District 13,12 and 32 had the least disparity of graduation rates between males and females, which means the male and female students perform very similar in these three districts.
(ii) Graduation Indicators by Gender
graduate_gen_tidy <- graduate_gen %>%
group_by(Category) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
graduate_geb_tidy<-graduate_gen_tidy%>%gather(Indicator, Percentage, -Category,-sum_per_grad)
graduate_geb_tidy$Indicator<-factor(graduate_geb_tidy$Indicator,levels=c('sum_per_regent','sum_per_adregent','sum_per_wadregent','sum_per_dropout','sum_per_local','sum_per_sacc','sum_per_tasc'))
We tried to facet wrap the graduation indicator so as to show the different graduation rates with respect to gender after summarizing the graduation rates grouped by category. However, the summarized graduation indicators were in wide forms, thus we had to transform them into the long format by using gather function.
to_string<-as_labeller(c('sum_per_adregent'='Advanced Regent Rate','sum_per_dropout'='Dropout Rate','sum_per_local'='Local Rate','sum_per_regent'='Regent Rate','sum_per_sacc'='SACC Rate','sum_per_tasc'='TASC Rate','sum_per_wadregent'='Without Advanced Regent Rate'))
ggplot(graduate_geb_tidy, aes(x = Category, y = Percentage, fill=Category)) +
geom_bar(stat = "identity") +
facet_wrap(~factor(Indicator),labeller=to_string) +
labs(title = "Various Graduation Indicators by Gender", x = "Gender", y = "Indicator Score",col="Category")+
theme(axis.text.x = element_text(size=10,angle=0))
# theme(legend.position = "bottom")+
# guides(fill=guide_legend(reverse=TRUE))
From the graph above, we noticed that no matter for Regent rates, or Advanced Regent rates or Without Advanced Regents rates, females had an edge over males. For the dropput rate, the rate belonged to females was lower. Nevertheless, in terms of the local rate, the rate of females was slightly bigger than the one of the males.
In conclusion, females perform better than males for most of the graduation indicators, in different districts during the period from 2001 to 2012.
d)Graduation Rate Analysis by Ethinicity
graduate_eth<-read.csv("2016 Graduation_Rates_Public_District_Ethnicity.csv",skip=8,col.names =col_names)
cols<-col_names[6:27]
graduate_eth[graduate_eth == "s"] = NA
graduate_eth[,cols] <- apply(graduate_eth[,cols], 2, function(x) as.numeric(x))
We imported the dataset of graduation indicators by ethnicity, utilizing the techniques we used before including standardizing missing data and transferring the data type of features.
##remove the missing data in per_grads_cohort
graduate_eth<- graduate_eth[complete.cases(graduate_eth),]
graduate_eth <- graduate_eth%>% filter(District!=79)
We implemented the similar data preprocessing stated as above to prepare the data set for analysis. We found out that a few observations are missing on all included variables. Therefore, we decided to remove all the missing observations, since the number of missing observations was rather small compared to the total data set.
Besides, we found out there was an extra district 79, New York City’s Alternative Schools District, included in the data set. Since it was not the focus of our research, we decided to drop the observations of District 79 to avoid the influence.
(i) Graduation Rate Analysis by Ethnicity,District and Year
graduate_eth_heat<- graduate_eth %>%
group_by(District,Category) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
graduate_eth_heat$District <- factor(graduate_eth_heat$District)
ggplot(graduate_eth_heat, aes(Category,District,fill = sum_per_grad)) +
geom_tile() +
# geom_text(aes(label =round(sum_per_grad,3)), color = "white",size=3) +
theme(axis.text.x = element_text(angle=0))+
scale_fill_viridis()+
labs(title = "The Graduation Rate of Different Ethnicities in Districts",fill="Graduation Rate")
At first, we tried to demonstrate the graduation rate of different ethnicities in districts using a line graph or histogram, but we figured out that the number of categories was too large to explicitly show the differences. Therefore, we decided to utilize heatmap to indicate the graduation rate differences with respect to districts and ethnicities.
From the heatmap above, we found out that there were some missing data for Asian in District 16, Multi-Racial and Native American in a few districts. It demonstrates that Asian and Native American performed the best over the districts, while Native American got the lowest graduation rate especially for District 16 and District 7.
For the Asian group, districts 13, 10, 5 and 4 got the best graduation rate, while districts 23 and 18 got the worst rate.
In terms of the Multi-Racial group, they got the best rate in districts 29, 21, 13, 4 and 3, while they got the worst one in District 27.
For the white group, the graduation rate difference over the districts was the most obvious among the ethnicity groups, with the best ones in District 13 and District 3, and the worst ones in District 23 and District 16.
graduate_eth_line<- graduate_eth %>%
group_by(Cohort_Year,Category) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort))
ggplot(graduate_eth_line , aes(x = Cohort_Year, y = sum_per_grad, col = Category) )+
geom_line(size=0.8)+
scale_x_continuous(breaks = seq(2001, 2012, 1))+
scale_color_brewer(palette = "Set1")+
ylim(0,1)+
labs(title = "Graduation Rate with Ethnicity over Years", x = "Year", y = "Graduation Rate")
From the line graph, it indicates that students could be divided into two groups – the first group with relatively high rate includes Asian, White and Multi-Racial, while the second group with lower rate contains Black, Hispanic and Native American.
Over the years, the graduation rates of Asian, White, Black and Hispanic were increasing gradually. However, for Multi-Racial and Native American, there were a few fluctuations over the years. For Multi-Racial, the rate hit the lowest point in 2004 and 2009, while it reaches the peak in 2007 and 2010. As for Native American, 2004 witnessed the climax of the graduation rate.
(ii) Graduation Indicators by Ethnicity
graduate_eth_tidy <- graduate_eth %>%
group_by(Category) %>%
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_adregent= sum(num_adregents) /sum(num_cohort),
sum_per_wadregent= sum(num_wadregents)/sum(num_cohort),
sum_per_local = sum(num_local)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort),
sum_per_tasc= sum(num_TASC)/sum(num_cohort))
Since the number of ethnicity categories was relatively large, we applied the parallel coordinate instead of histogram and facet wrap to analyze the graduation indicators by ethnicity.
glabel<-c('Graduation Rate','Dropout Rate','Regent Rate','Advanced Regent Rate','Without Advanced Regent Rate','Local Rate','SACC Rate','TASC Rate')
ggparcoord(graduate_eth_tidy , columns =2:9,scale="globalminmax",groupColumn = "Category")+
theme(axis.text.x = element_text(size=10,angle=10))+
scale_x_discrete(labels=glabel)+
scale_color_brewer(palette = "Set1")+
ggtitle("Graduation Indicators by Ethnicity ")
For Asian, Multi-Racial and White, they got relatively high graduation rate (over 80%) and low dropout rate(about 10%). For the category Asian, majority of the graduate students got Regent rate, where the Advanced Regent rate was higher than the Without Advanced Regent rate. In terms of Multi-Racial and White students, the students got higher Without Advanced Regent rate compared to Advanced Regent rate.
For the other three groups Black, Hispanic and Native American, the students earned relatively low graduation rate (about 60%), relatively high dropout rate (about 20%) and low Regent rate (about 50%), most of which were belonged to Without Advanced Regent Diploma.
e) Graduation Rate Analysis by SWD
graduate_swd<-read.csv("2016 Graduation_Rates_Public_District_SWD.csv",skip=8,col.names =col_names)
cols<-col_names[6:27]
graduate_swd[graduate_swd == "s"] = NA
graduate_swd[,cols] <- apply(graduate_swd[,cols], 2, function(x) as.numeric(x))
We imported the dataset of graduation indicators by SWD, utilizing the techniques we used before including standardizing missing data and transferring the data type of features.
##remove the missing data in per_grads_cohort
graduate_swd<- graduate_swd[complete.cases(graduate_swd),]
graduate_swd <- graduate_swd%>% filter(District!=79)
We implemented the similar data preprocessing stated as above to prepare the data set for analysis. We found out that a few observations are missing on all included variables. Therefore, we decided to remove all the missing observations, since the number of missing observations was rather small compared to the total data set.
Besides, we found out there was an extra district 79, New York City’s Alternative Schools District, included in the data set. Since it was not the focus of our research, we decided to drop the observations of District 79 to avoid the influence.
graduate_swd_heat<- graduate_swd %>%
group_by(District,Category) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort))
graduate_swd_heat$District <- factor(graduate_swd_heat$District)
p1<-ggplot(graduate_swd_heat, aes(Category,District,fill = sum_per_grad)) +
geom_tile() +
# geom_text(aes(label =round(sum_per_grad,3)), color = "white",size=3) +
# theme(axis.text.x = element_text(angle=10))+
scale_fill_viridis()+
labs(title = "The graduation rate by swd in districts",fill="Graduation Rate")
p2<-ggplot(graduate_swd_heat, aes(Category,District,fill = sum_per_sacc)) +
geom_tile() +
# geom_text(aes(label =round(sum_per_sacc,3)), color = "white",size=3) +
# theme(axis.text.x = element_text(angle=10))+
scale_fill_viridis()+
labs(title = "The SACC rate by swd in districts",fill="SACC Rate")
grid.arrange(p1,p2,nrow=1)
SWD means Student with a Disability. Since SACC Certificate (certificate for students with disabilities) was strongly related to the disabled students, we drew two heatmaps for comparison – one is the graduation rate by SWD in districts, and the other is SACC rate with respect to SWD in districts.
From the heatmaps above, we attempted to show the graduation rate and SACC rate with respect to SWD. In terms of the graduation rate, the students without disability perform better than the ones with disability among the districts. As for the SACC rate, District 16 had the largest percentage of students with disability who got the SACC Certificate (certificate for students with disabilities).
(i) Graduation Indicators by SWD
graduate_swd_line<- graduate_swd %>%
group_by(Cohort_Year,Category) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_per_dropout= sum(num_dropout)/sum(num_cohort),
sum_per_regent=sum(num_regents) /sum(num_cohort),
sum_per_sacc= sum(num_SACC)/sum(num_cohort))%>%
gather(Indicator, Percentage, -Category,-Cohort_Year)
#graduate_swd_line$Indicator<-factor(graduate_swd_line$Indicator,levels=c('Graduation Rate','Dropout Rate','Regent Rate','SACC Rate'))
ggplot(graduate_swd_line , aes(x = Cohort_Year, y = Percentage, col = Indicator ) )+
geom_line(aes(linetype=Category),size=0.8)+
scale_x_continuous(breaks = seq(2001, 2012, 1)) +
scale_color_brewer(palette = "Set1",labels=c('Dropout Rate','Graduation Rate','Regent Rate','SACC Rate'))+
labs(title = "Graduation Rate with SWD and District", x = "Year", y = "Graduation Rate")
We would like to show the time series of different graduation indicators in regard to SWD category in a line graph, thus we applied gather function after summarizing the values of indicators and established the line graph setting color to indicator and line type to SWD category.
According to lines, the students without disability outperform SWD generally during the years. Over the years, the graduation rate and regent rate were getting better in both groups SWD and students without disability, especially for students without disability, the regent rate was approaching to graduation rate. We also noticed that the rate of SACC certificate was decreasing for students with disability.
f) SAT Analysis
SAT<-read.csv("SAT_Results.csv")
SAT[SAT=="s"]<-NA
SAT[,3:6] <- apply(SAT[,3:6], 2, function(x) as.numeric(x))
SAT <- SAT[complete.cases(SAT),]
We imported the dataset of SAT scores, utilizing the techniques we used before including standardizing missing data and transferring the data type of features.
(i) SAT Average Score Distribution of different schools
g1<-ggplot(data=na.omit(SAT))+
geom_histogram(aes(x=as.numeric(as.character(SAT.Critical.Reading.Avg..Score))),binwidth = 5,fill="papayawhip", colour="khaki4")+
xlab("SAT Critical Reading Avg. Score")+
ggtitle("SAT Average Score Distribution of different schools")
g2<-ggplot(data=na.omit(SAT))+
geom_histogram(aes(x=as.numeric(as.character(SAT.Math.Avg..Score))),binwidth = 5,fill="papayawhip", colour="khaki4")+
xlab("SAT Math Avg. Score")
g3<-ggplot(data=na.omit(SAT))+
geom_histogram(aes(x=as.numeric(as.character(SAT.Writing.Avg..Score ))),binwidth = 5,fill="papayawhip", colour="khaki4")+
xlab("SAT Writing Avg. Score")
grid.arrange(g1,g2,g3,nrow=3)
The three histograms show the SAT average score of different schools, which are all right-skewed. In the plot of SAT Critical Reading Avg. Score, the majority of average score centered around the range [350,420]. On the other hand, the average score of math concentrated around 370, while the score of writing was around [350,400].
(ii) Proportion of SAT Takers over Cohort Students in 32 Districts
SAT_tidy<-SAT%>%mutate(District=substr(as.character(DBN),1,2))%>%select(District,Num.of.SAT.Test.Takers,SAT.Critical.Reading.Avg..Score,SAT.Math.Avg..Score,SAT.Writing.Avg..Score)
SAT_tidy$District<-as.factor(SAT_tidy$District)
SAT_tidy <- SAT_tidy%>% filter(District!=79 & District!=75)%>%
mutate(sum_reading = Num.of.SAT.Test.Takers*SAT.Critical.Reading.Avg..Score,
sum_math = Num.of.SAT.Test.Takers*SAT.Math.Avg..Score,
sum_writing = Num.of.SAT.Test.Takers*SAT.Writing.Avg..Score )%>%
group_by(District)%>%
dplyr::summarise(reading=sum(sum_reading)/sum(Num.of.SAT.Test.Takers),
math = sum(sum_math)/sum(Num.of.SAT.Test.Takers),
writing = sum(sum_writing)/sum(Num.of.SAT.Test.Takers),
num_taker =sum(Num.of.SAT.Test.Takers))%>%
gather(session,score,-District,-num_taker)
SAT_tidy$District <- as.numeric(SAT_tidy$District )
graduate_s<- graduate%>%
group_by(District) %>%
# dplyr::summarise(num_grads= sum(num_cohort))
dplyr::summarise(sum_per_grad= sum(num_grads)/sum(num_cohort),
sum_cohort = sum(num_cohort))
grad_SAT<-merge(graduate_s,SAT_tidy,by="District")
In order to connect the SAT takers of different schools with the 32 districts, we extracted the District part from the DBN code of schools, creating new features “reading”,“math”,“writing” and “num_taker” to indicate the average score of takers in districts and the number of SAT takers in districts. Then we merged the processed SAT dataset with the dataset of graduation rates among districts. The reason why we got to transform the data type of the “District” feature to numeric in SAT_tidy was because the data type of the “District” feature in graduate_s data frame was numeric. In order to merge two data sets, we had to unify the data type of the feature.
Besides, we found out there were extra districts 75 and 79, included in the data set. Since the two districts were not the focus of our research, we decided to drop the observations of District 79 and District 75 to avoid the influence.
SAT_taker<-grad_SAT %>%mutate(per_taker = num_taker/sum_cohort)%>%select(District,per_taker)
SAT_taker<-unique(SAT_taker)
SAT_taker$District<-as.factor(SAT_taker$District)
ggplot(SAT_taker, aes(x = District, y = per_taker)) +
geom_bar(stat = "identity",fill="papayawhip", colour="khaki4") +
labs(title = "Percentage of SAT Takers over Cohort Students in 32 Districts", x = "District", y = "Percentage")+
theme(axis.text.x = element_text(size=10,angle=0))
We utilized mutate function to calculate the percentage of students taking SAT in 32 districts.
The bar chart shows the proportion of the recorded SAT takers within the cohort students in 32 districts. It is worth mentioning that the percentage of SAT takers shown above just refers to the observations of the dataset, but not the real number of individuals taking SAT exams.
(iii) SAT Scores of Different Sessions with respect to District
SAT_tidy$District<-as.numeric(SAT_tidy$District)
SAT_tidy$session<-as.factor(SAT_tidy$session)
ggplot(SAT_tidy)+
geom_line(aes(x = District, y = score, col = session))+
scale_x_continuous(breaks = seq(1, 32, 1)) +
ylim(0,570)+
labs(title = "SAT Scores of Different sessions with District", x = "District", y = "Scores")
According to the line graph, we found out that, among all the districts, the average math scores were highest than the reading scores and the writing scores. Besides, the average reading scores were slightly higher than the writing scores. The district with the best SAT scores was District 13, while the districts with the worst SAT scores were District 12 and District 32. In terms of the disparity among the scores of three sessions, District 20, 26 and 1 got the most evident disparity.
(iv) The Relationship between Graduation Rate and SAT Scores
ggplot(grad_SAT, aes(x=sum_per_grad, y=score)) + geom_point(aes(colour=session))+geom_smooth(aes(colour=session))+
ylim(0,650)+
labs(title = "The Relationship between Graduation Rate and SAT Scores", x = "graduatoin rate", y = "Scores")
According to the plot, it describes the correlation between graduation rate and SAT scores using both scatter plots and smooth lines.
From the slope of the graph, we found out that the SAT scores were slightly positively correlated with graduation rates, which means the higher graduation rate comes along with the higher SAT scores, especially for math scores.
(4) School Analysis
a) School Progress Evaluation
# clean
progress <- read.csv("progress.csv", header=TRUE, stringsAsFactors=FALSE)
progress[progress == "None"] = NA
progress[progress == "Pending"] = NA
progress <- na.omit(progress)
prog <- progress[c(1,4,5,6,7,8)]
colnames(prog) <- c("DB", "10-11","09-10","08-09","07-08","06-07")
prog <- prog %>% gather(Year, Rating, 2:6)
prog[prog == "A"] = 4
prog[prog == "B"] = 3
prog[prog == "C"] = 2
prog[prog == "D"] = 1
prog[prog == "F"] = 0
prog$Rating <- as.numeric(prog$Rating)
prog1 <- unique(prog %>% inner_join(loc))
Joining, by = "DB"
# plot
prog_D <- dplyr::summarize(group_by(prog1,Year,District), mean(Rating))
prog_D$District <- factor(prog_D$District)
colnames(prog_D)[3] <- "Rating"
ggplot(prog_D, aes(y = Year,x = District, fill=Rating)) + geom_raster() + labs(title = " School Progress Rating Mean by District ") +
theme_grey(12)
Progress Reports graded each school with an A, B, C, D, or F and were based on student progress (60%), student performance (25%), and school environment (15%). Scores were based on comparing results from one school to a peer group of up to 40 schools with the most similar student population and to all schools citywide. They were designed to help parents, teachers, principals, and others understand how well schools are doing??and compare them to other, similar schools.
For the dataset of progress reports, we deleted missing data and meaningless data and changed the grade A-F to integer 4-0 to help the data visualization. Since the dataset provided the progress grades of all schools in NYC, we joined it with the data of location information and summrize them by district to get the average rating of each district. we tried to visualize it with parallel coordinates and heatmap and found that it was easier for us to get information and compare ratings between different districts intuitively with heatmap.
According to the figure above, we can find that the rating of all districts in 2008-2009 are remarkably higher than those in other years. This is weird beacause the rating are based on comparing with similar schools. The overall rating of all districts should not change a lot over the years. Therefore, We think there is something wrong with the data of 2008-2009. Without considering the data of 2008-2009, we can find that the progress ratings of district 2,20,26 were remarkably better than other that of those of other districts over the year, which means schools in these districts do better than schools in other districts.
b) School Quality Evaluation
# clean
quality <- read.csv("QualityReview.csv", header=TRUE, stringsAsFactors=FALSE)
colnames(quality) <- c("DBN", "Year", "Rating")
quality <- separate(quality, Year, c("x", "Year"), sep="Y")
quality <- quality[-2]
quality[quality == "U"] = 0
quality[quality == "UD"] = 0
quality[quality == "UPF"] = 1
quality[quality == "D"] = 2
quality[quality == "P"] = 3
quality[quality == "WD"] = 4
quality[quality == "O"] = 5
quality <- quality[-c(2994,3812),]
quality <- transform(quality, Rating = as.numeric(Rating))
quality[quality == " 11-12"] = "11-12"
# plot
qualityData2 <- unique(quality %>% inner_join(loc))
Joining, by = "DBN"
quality_D <- dplyr::summarize(group_by(qualityData2,Year,District), mean(Rating))
colnames(quality_D)[3] <- "Rating"
quality_D <- subset(quality_D, ! District %in% c("0","75") )
quality_D$District <- factor(quality_D$District)
ggplot(quality_D, aes(y = Year,x = District, fill=Rating)) + geom_raster() + labs(title = " School Quality Rating Mean by District ") +
theme_grey(16)
The Quality Review is a process that evaluates how well schools are organized to support student learning and teacher practice. It was developed to assist New York City Department of Education (NYCDOE) schools in raising student achievement by looking behind a school??s performance statistics to ensure that the school is engaged in effective methods of accelerating student learning.
For the quality review dataset, we removed the reduplicated data and changed the results, O, WD, P, etc. to integer, from 0 to 4 make it easier for us to visualize the data set. Since the dataset provided the quality grades of all schools in NYC, we joined it with the data of location information and summrize them by district to get the average rating of each district. Compared with parallel coordinates and histogram, heatmap could show the change of quality ratings over years explicitly and is convenient for us to compare the quality ratings between different districts.
From the figure, we can find that the overall rating of all districts were decreasing over these years. This could be resulted from that along with the development of society, people’s requirement level are increasing. Besides, we can find that the quality ratings of district 2,20,26 were remarkably better than other that of those of other districts over the year, which means schools in these districts are engaged in more effective methods of accelerating student learning. Combining the information from this figure with that from the above figure, we can conclude that schools in district 2,20,26 are good choice for parents and children.